#!/usr/bin/env python
# coding: utf-8

# In[1]:


exec(open('init_path.py').read())
exec((P_Lib/'GasStation.py').read_text())
get_ipython().run_line_magic('matplotlib', 'inline')


# ### Build Universal Dataset of Gas Station

# In[2]:


# raw 1 & 2 (from data dump) + raw_azure (from Azure)
GS = read_hdf(P_GS_Data_Raw / 'gas_station_raw_1.h5', 'GS').append(read_hdf(P_GS_Data_Raw / 'gas_station_raw_2.h5', 'GS'))
del GS['Ver']; del GS['VerTime']; del GS['Holiday']
GS = GS.rename(columns={'Place':'City'}).append(read_hdf(P_GS_Data_Raw / 'gas_station_raw_azure.h5', 'GS'))
GS = GS.sort_values(['Lat','Lng']).reset_index(drop=True)
GS.loc[GS.Brand.isin(['nan',r'\N']), 'Brand'] = np.nan
GS['Post'] = to_numeric(GS.Post, errors='coerce')
print(GS.shape)
print(GS.ID.nunique())
GS.head(2)


# In[3]:


# German Boundary Box
bbox = (5.9, 47.3, 15.0169958839, 55.0159)
lng_min, lat_min, lng_max, lat_max = bbox
GS['InDE'] = (GS.Lat>=lat_min)&(GS.Lat<=lat_max)&(GS.Lng>=lng_min)&(GS.Lng<=lng_max)
GS.loc[~GS.InDE, ['Lat','Lng']] = np.nan


# In[4]:


# Drop Duplicates
GS = GS.drop_duplicates().reset_index(drop=True)
print(GS.shape)


# In[5]:


# Drop Duplicates by ID, keep higher score
GS['DataScore'] = GS[['Lat','Lng']].notnull().sum(axis=1) * 10 + GS[['Name','Brand','Lat','Lng']].notnull().sum(axis=1)
GS = GS.sort_values(['ID','DataScore'], ascending=False) # Descending Order
GS = GS.drop_duplicates(['ID'], keep='first') # Keep First
print(GS.shape)
GS.head(2)


# In[6]:


# 1. Missing StIDs: those exist in price history but not in gas station
# 2. StID is str and is too long; count all gas station appearance & new StID by nOccur
df_gs = DataFrame(columns=['ID','nOccur'])
files = list((P_GS_Data_Raw / 'Raw_HDF').glob('*.h5'))
files = sorted(files, key=lambda f: int(f.name.split('.')[0]))
len(files)


# In[11]:


def summarize_st_nOccur(f):
    df = read_hdf(f, 'GS')
    # Count StID Occurrence => Build Universal Gas Station Dataset (also shorten StID)
    return df.groupby('ID').e10.count().reset_index().rename(columns={'e10':'nOccur'})


# In[12]:


# ls = []
# for f in files[:10]:
#     ls.append(summarize_st_nOccur(f))
with Pool() as p:
    ls = p.map(summarize_st_nOccur, files)
#     ls = p.map(summarize_st_nOccur, files[:10])


# In[13]:


df_gs = concat(ls)
df_gs['nOccur'] = df_gs['nOccur'].astype(int)
df_gs = df_gs.groupby('ID').nOccur.sum().reset_index()
df_gs.to_hdf(P_GS_Data / 'GS' / 'gas_station_nOccur.h5', 'GS', mode='w', complevel=9, complib='blosc')
df_gs.head(2)


# In[248]:


df_gs = read_hdf(P_GS_Data / 'GS' / 'gas_station_nOccur.h5', 'GS')
df_gs.head(2)


# In[14]:


df = merge(GS, df_gs, how='outer') # merge two database tgt: 1. Raw Gas Station directly from the database
df['nOccur'] = df.nOccur.fillna(0).astype(int)
# 2. gas station occurrence in price history data
print(df.shape)
print(df.ID.nunique())
df.head(2)


# In[15]:


# Cleanup Brand
for col in ['Name','Brand','St','City']:
    df[col] = df[col].fillna('').str.lower().str.strip()
# for top brands, match first word of station name as a brand
s = df.Brand.value_counts()
ls_brands_11gs = [b for b in s[s>=11].index if (len(b)>=1)&(b[:4]!='frei')&(b[:10]!='tankstelle')]
idx = (df.Brand=='')&(df.Name!='')&(df.Name.str.split(' ').str[0].isin(ls_brands_11gs))
df.loc[idx, 'Brand'] = df.loc[idx, 'Name'].str.split(' ').str[0]
# Sort by nOccur & create StID
df['nOccur'] = df.nOccur.fillna(0).astype(int)
df = df.sort_values(by='nOccur', ascending=False).reset_index(drop=True).reset_index(drop=False).rename(columns={'index':'StID'})


# In[16]:


# More Cleanup of Brands
df.loc[df.Name.str[:2]=='bk', 'Brand'] = 'bk'
df.loc[df.Name=='','Name'] = df.loc[df.Name=='','StID']
df.loc[df.Brand=='','Brand'] = df.loc[df.Brand=='','StID']
df.head(2)


# In[17]:


del df['DataScore']
del df['InDE']
df['Brand'] = df.Brand.astype(str)
df['ID'] = df.ID.astype(str)
df['Name'] = df.Name.astype(str)
df['Brand'] = df.Brand.astype(str)
df['St'] = df.St.astype(str)
df['StNum'] = df.StNum.astype(str)
df['Post'] = df.Post.fillna(0).astype(int)
df['City'] = df.City.astype(str)


# In[18]:


df.to_hdf(P_GS_Data / 'GS' / 'gas_station.h5', 'GS', mode='w', complevel=9, complib='blosc')


# In[19]:


# df = read_hdf(P_GS_Data / 'GS' / 'gas_station.h5', 'GS')
# df.head(2)


# In[21]:


dict_gsid_to_stid = df.set_index('ID').StID.to_dict()
save_obj(dict_gsid_to_stid, P_GS_Data / 'GS' / 'dict_gsid_to_stid.pkl')
# dict_gsid_to_stid = load_obj(P_GS_Data / 'GS' / 'dict_gsid_to_stid.pkl')


# ### Gas Station + Brand Info (run Brands First)

# In[32]:


GS = read_hdf(P_GS_Data / 'GS' / 'gas_station.h5', 'GS')
GS['BrandID'] = GS.Brand.map(dict_brand_to_brandid)
GS['SizeGp'] = GS.Brand.map(dict_brand_to_sizegp)
GS.head(2)


# In[33]:


GS.to_hdf(P_GS_Data / 'GS' / 'gas_station.h5', 'GS', mode='w', complevel=9, complib='blosc')

GS.to_stata(P_GS_Data / 'GS' / 'gas_station.dta')

# In[2]:


# GS = read_hdf(P_GS_Data / 'GS' / 'gas_station.h5', 'GS')


# In[4]:


GS.head(10)


# In[35]:


dict_stid_to_sizegp = GS.set_index('StID').SizeGp.to_dict()
save_obj(dict_stid_to_sizegp, P_GS_Data / 'GS' / 'dict_stid_to_sizegp.pkl')
# dict_stid_to_sizegp = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_sizegp.pkl')


# In[36]:


dict_stid_to_brandid = GS.set_index('StID').BrandID.to_dict()
save_obj(dict_stid_to_brandid, P_GS_Data / 'GS' / 'dict_stid_to_brandid.pkl')
# dict_stid_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_brandid.pkl')


# ### Brands

# In[22]:


GS = read_hdf(P_GS_Data / 'GS' / 'gas_station.h5', 'GS')
GS.head(2)


# In[23]:


df = GS.groupby('Brand').StID.count().sort_values(ascending=False).reset_index().reset_index()
df.columns = ['BrandID','Brand','nGS']
print(df.shape)
df.head()


# In[24]:


# Brand with nGS>=500 retain BrandID as Gp, others keep going up
bins = IntervalIndex.from_tuples([(100,500),(10,100),(1,10),(0,1)])
df['SizeGp'] = cut(df.nGS, bins).cat.codes + (df.nGS>500).sum()
df.loc[df.nGS>500, 'SizeGp'] = df.loc[df.nGS>500, 'BrandID'] 
df.head()


# In[25]:


side_by_side(df.SizeGp.value_counts(), df.groupby('SizeGp').nGS.sum())


# In[26]:


df.to_hdf(P_GS_Data / 'GS' / 'gas_station_brand.h5', 'GS', mode='w', complevel=9, complib='blosc')


# In[27]:


# Brand = read_hdf(P_GS_Data / 'GS' / 'gas_station_brand.h5', 'GS')
# Brand.head(2)


# In[28]:


dict_brand_to_brandid = df.set_index('Brand').BrandID.to_dict()
save_obj(dict_brand_to_brandid, P_GS_Data / 'GS' / 'dict_brand_to_brandid.pkl')
# dict_brand_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_brand_to_brandid.pkl')


# In[29]:


dict_brand_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_brand_to_brandid.pkl')
dict_brandid_to_brand = reverse_dict(dict_brand_to_brandid)
save_obj(dict_brandid_to_brand, P_GS_Data / 'GS' / 'dict_brandid_to_brand.pkl')
# dict_brandid_to_brand = load_obj(P_GS_Data / 'GS' / 'dict_brandid_to_brand.pkl')


# In[30]:


dict_brand_to_sizegp = df.set_index('Brand').SizeGp.to_dict()
save_obj(dict_brand_to_sizegp, P_GS_Data / 'GS' / 'dict_brand_to_sizegp.pkl')
# dict_brand_to_sizegp = load_obj(P_GS_Data / 'GS' / 'dict_brand_to_sizegp.pkl')


# In[31]:


d1 = {k:v for k,v in zip(df.loc[df.nGS>500, 'SizeGp'].values, df.loc[df.nGS>500, 'Brand'].values)}
d2 = {k:v for k,v in zip(range(max(d1.keys()) + 1, max(d1.keys()) + 1 + 4), ['(100,500]','(10,100]','(1,10]','(0,1]'])}
dict_sizegp_to_names = {**d1, **d2}
save_obj(dict_sizegp_to_names, P_GS_Data / 'GS' / 'dict_sizegp_to_names.pkl')
# dict_sizegp_to_names = load_obj(P_GS_Data / 'GS' / 'dict_sizegp_to_names.pkl')


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:




